import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
%matplotlib inline
import plotly.offline as py
py.offline.init_notebook_mode(connected=True)
# Load the raw visit records from a local CSV file.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped
# and reported rather than aborting the load.
data = pd.read_csv(
    "/home/kla/Documents/tests/data_coordinates.csv",
    delimiter=",",
    header=0,
    on_bad_lines="warn",
    low_memory=True,
)
# Bare expressions: they only display something in an interactive session.
data.shape
data.head(5)
# Split the "coordinates" text column into two float columns.
# The first and last character of each value are dropped (presumably the
# enclosing brackets - confirm against the CSV) before splitting on the comma.
_coordinate_text = data["coordinates"].str.slice(1, -1)
coords = _coordinate_text.str.split(",", expand=True).astype(float)
coords.columns = ["lat", "long"]
# Append the new columns next to the existing ones (rows align on the index).
data = pd.concat([data, coords], axis=1)
# Parse each timestamp column once, then derive the date, time-of-day and
# weekday columns from the parsed values instead of re-parsing per column.
data["visit_start"] = pd.to_datetime(data["visit_start"])
_visit_start = data["visit_start"].dt
# "visit_end" itself stays unparsed in the frame; only derived columns are stored.
_visit_end = pd.to_datetime(data["visit_end"]).dt

data["visit_start_date"] = _visit_start.date
# Despite the "_hour" suffix, these columns hold the full time of day.
data["visit_start_hour"] = _visit_start.time
data["visit_end_date"] = _visit_end.date
data["visit_end_hour"] = _visit_end.time
data["visit_day"] = _visit_start.day_name()

# Make the weekday an ordered categorical so grouped output and plot axes
# follow Monday..Sunday instead of alphabetical order.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data["visit_day"] = pd.Categorical(data["visit_day"], categories=days, ordered=True)
data.head(3)
# The full file is too large for the author's laptop to process comfortably,
# so only a random quarter of the rows is kept. The sample is unseeded, so
# the selected rows differ from run to run.
data = data.sample(frac=0.25)
data.describe()
data.shape

# Display order of the sector categories, keyed by column name as expected by
# the `category_orders` argument of the plotly express calls further down.
sector_sequence = {
    "sector_name": [
        "Baumarkt",
        "Discounter",
        "Möbelhäuser",
        "Mode",
        "Fast Food",
    ],
}
# One colour per sector, passed as `color_discrete_sequence` to the map plots.
color_sequence = [
    "red",
    "green",
    "blue",
    "goldenrod",
    "pink",
]
def germany_count(sample_size=50000):
    """Show a scatter map of a random sample of visits, coloured by sector.

    Reads the module-level `data`, `color_sequence` and `sector_sequence`.

    Args:
        sample_size: Maximum number of visits to plot. It is capped at the
            number of rows in `data`, because sampling more rows than exist
            (without replacement) raises ValueError.
    """
    rows_to_plot = min(sample_size, len(data))
    # Fixed seed: the same rows are drawn on every call for a given `data`.
    sample_data = data.sample(n=rows_to_plot, random_state=1)
    fig = px.scatter_mapbox(
        sample_data,
        lat="lat",
        lon="long",
        hover_name="store_city",
        color="sector_name",
        hover_data=["sector_name", "visit_duration_minutes"],
        color_discrete_sequence=color_sequence,
        category_orders=sector_sequence,
        zoom=5,
        height=500,
    )
    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.show()
# Plotly notebook mode is initialised again before each step; the repeats look
# like notebook cell headers carried over into this file (assumption - confirm).
py.offline.init_notebook_mode(connected=True)
# Render the Germany-wide sample map.
germany_count()
py.offline.init_notebook_mode(connected=True)
def berlin_count():
    """Show a scatter map of the visits whose federal state is "Berlin".

    Markers are coloured by sector and sized by visit duration. Reads the
    module-level `data`, `color_sequence` and `sector_sequence`.
    """
    berlin_visits = data.loc[data["store_federal_state"].eq("Berlin")]
    fig = px.scatter_mapbox(
        berlin_visits,
        lat="lat",
        lon="long",
        color="sector_name",
        size="visit_duration_minutes",
        color_discrete_sequence=color_sequence,
        category_orders=sector_sequence,
        size_max=8,
        zoom=8,
    )
    # Base map style and margins applied in a single layout update.
    fig.update_layout(
        mapbox_style="open-street-map",
        margin={"r": 1, "t": 1, "l": 2, "b": 2},
    )
    fig.show()
# Plotly notebook mode is initialised again before each step; the repeats look
# like notebook cell headers carried over into this file (assumption - confirm).
py.offline.init_notebook_mode(connected=True)
# Render the Berlin map.
berlin_count()
py.offline.init_notebook_mode(connected=True)
def hamburg_count():
    """Show a scatter map of the visits whose federal state is "Hamburg".

    Markers are coloured by sector and sized by visit duration. Reads the
    module-level `data`, `color_sequence` and `sector_sequence`.
    """
    hamburg_visits = data.loc[data["store_federal_state"].eq("Hamburg")]
    fig = px.scatter_mapbox(
        hamburg_visits,
        lat="lat",
        lon="long",
        color="sector_name",
        size="visit_duration_minutes",
        color_discrete_sequence=color_sequence,
        category_orders=sector_sequence,
        size_max=8,
        zoom=8,
    )
    # Base map style and margins applied in a single layout update.
    fig.update_layout(
        mapbox_style="open-street-map",
        margin={"r": 1, "t": 1, "l": 2, "b": 2},
    )
    fig.show()
# Plotly notebook mode is initialised again before each step; the repeats look
# like notebook cell headers carried over into this file (assumption - confirm).
py.offline.init_notebook_mode(connected=True)
# Render the Hamburg map.
hamburg_count()
py.offline.init_notebook_mode(connected=True)
def bayern_count():
    """Show a scatter map of the visits whose federal state is "Bayern".

    Markers are coloured by sector and sized by visit duration. The initial
    zoom is wider (5) than on the Berlin and Hamburg maps (8). Reads the
    module-level `data`, `color_sequence` and `sector_sequence`.
    """
    bayern_visits = data.loc[data["store_federal_state"].eq("Bayern")]
    fig = px.scatter_mapbox(
        bayern_visits,
        lat="lat",
        lon="long",
        color="sector_name",
        size="visit_duration_minutes",
        color_discrete_sequence=color_sequence,
        category_orders=sector_sequence,
        size_max=8,
        zoom=5,
    )
    # Base map style and margins applied in a single layout update.
    fig.update_layout(
        mapbox_style="open-street-map",
        margin={"r": 1, "t": 1, "l": 2, "b": 2},
    )
    fig.show()
# Plotly notebook mode is initialised again before each step; the repeats look
# like notebook cell headers carried over into this file (assumption - confirm).
py.offline.init_notebook_mode(connected=True)
# Render the Bayern map.
bayern_count()
py.offline.init_notebook_mode(connected=True)
def visit_over_time(lockdown_position=59):
    """Plot the number of visits per day as a line chart.

    Counts non-null `user_id` values per `visit_start_date` in the
    module-level `data` and labels one day as the start of the lockdown.

    Args:
        lockdown_position: Zero-based position, within the date-sorted daily
            counts, of the day to label. The label is skipped when the data
            covers too few days to reach that position.
    """
    data_user = data.groupby(["visit_start_date"])["user_id"].count()
    plt.figure(figsize=(20, 8))
    plt.ylabel("volume of visits")
    plt.xlabel("date of visits")
    # The index holds dates, so the count is read by position with .iloc;
    # plain `data_user[59]` depends on a deprecated positional fallback.
    if len(data_user) > lockdown_position:
        plt.annotate(
            "start of lockdown in Germany",
            (data_user.index[lockdown_position], data_user.iloc[lockdown_position]),
            xytext=(20, 20),
            textcoords="offset points",
        )
    plt.plot(data_user)
    plt.show()
# Plotly notebook mode is initialised again here although the next call draws
# with matplotlib; likely a leftover notebook cell header (assumption - confirm).
py.offline.init_notebook_mode(connected=True)
# Draw the daily visit-volume line chart.
visit_over_time()
def visit_breakdown(lockdown_position=59):
    """Plot daily visit volume as a stacked area chart, one band per sector.

    Reads the module-level `data`. One day is labelled as the start of the
    lockdown, at the height of that day's total visit count.

    Args:
        lockdown_position: Zero-based position, within the date-sorted daily
            counts, of the day to label. The label is skipped when the data
            covers too few days to reach that position.
    """
    data_user = data.groupby(["visit_start_date"])["user_id"].count()
    fig, ax = plt.subplots(figsize=(20, 10))
    plt.ylabel("volume of visit")
    # Rows are dates, columns are sectors, values are visit counts.
    sector_counts = (
        data.groupby(["visit_start_date", "sector_name"])["sector_name"]
        .count()
        .unstack()
    )
    sector_counts.plot(ax=ax, stacked=True, kind="area")
    # The index holds dates, so the count is read by position with .iloc;
    # plain `data_user[59]` depends on a deprecated positional fallback.
    if len(data_user) > lockdown_position:
        plt.annotate(
            "start of lockdown in Germany",
            (data_user.index[lockdown_position], data_user.iloc[lockdown_position]),
            xytext=(20, 20),
            textcoords="offset points",
        )
py.offline.init_notebook_mode(connected=True)
# Draw the per-sector stacked area chart.
visit_breakdown()
# Visit counts with one row per sector and one column per weekday.
stores = (
    data.groupby(["sector_name", "visit_day"])["user_id"]
    .count()
    .unstack(level="visit_day")
)
# Bare expression: displays the table in an interactive session only.
stores
def city_breakdown(lockdown_position=59):
    """Plot daily visit volume as a stacked area chart, one band per state.

    Despite the name, the breakdown is by `store_federal_state`, not by city.
    Reads the module-level `data`. One day is labelled as the start of the
    lockdown, at the height of that day's total visit count.

    Args:
        lockdown_position: Zero-based position, within the date-sorted daily
            counts, of the day to label. The label is skipped when the data
            covers too few days to reach that position.
    """
    data_user = data.groupby(["visit_start_date"])["user_id"].count()
    fig, ax = plt.subplots(figsize=(20, 10))
    plt.ylabel("number of visits")
    # The index holds dates, so the count is read by position with .iloc;
    # plain `data_user[59]` depends on a deprecated positional fallback.
    if len(data_user) > lockdown_position:
        plt.annotate(
            "start of lockdown in Germany",
            (data_user.index[lockdown_position], data_user.iloc[lockdown_position]),
            xytext=(20, 20),
            textcoords="offset points",
        )
    # Rows are dates, columns are federal states, values are visit counts.
    state_counts = (
        data.groupby(["visit_start_date", "store_federal_state"])["store_federal_state"]
        .count()
        .unstack()
    )
    state_counts.plot(ax=ax, stacked=True, kind="area")
py.offline.init_notebook_mode(connected=True)
# Draw the per-state stacked area chart.
city_breakdown()
# Grouped (non-stacked) bar chart: visit counts per weekday, one bar per state.
fig, ax = plt.subplots(figsize=(20, 8))
ax.set_ylabel("number of visits")
(
    data.groupby(["visit_day", "store_federal_state"])["store_federal_state"]
    .count()
    .unstack()
    .plot.bar(ax=ax, stacked=False)
)
# Mean visit duration per sector and per federal state. These two bare
# expressions only display a table in an interactive session.
# `.mean()` replaces `aggregate(np.mean)`: passing the NumPy callable triggers
# a FutureWarning in recent pandas, while the computed means are the same.
data.groupby(["sector_name"])["visit_duration_minutes"].mean().reset_index()
data.groupby(["store_federal_state"])["visit_duration_minutes"].mean().reset_index()

# Mean duration for every sector / weekday / state combination, flattened to
# a long-format frame so seaborn can use the group keys as plot variables.
store_day = (
    data.groupby(["sector_name", "visit_day", "store_federal_state"])["visit_duration_minutes"]
    .mean()
    .reset_index()
)

# Same points in both plots; only the colouring variable differs.
fig, ax = plt.subplots(figsize=(20, 8))
ax = sns.swarmplot(x="visit_day", y="visit_duration_minutes", hue="sector_name", data=store_day, size=10)
ax.set_title("average duration of visit across days by sector")

fig, ax = plt.subplots(figsize=(20, 5))
ax = sns.swarmplot(x="visit_day", y="visit_duration_minutes", hue="store_federal_state", data=store_day, size=10)
ax.set_title("average duration of visits by day broken down by state")
# Load the population-per-postcode table.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped.
population = pd.read_csv(
    "/home/kla/Documents/plz_einwohner.csv",
    delimiter=",",
    header=0,
    on_bad_lines="warn",
    low_memory=True,
)
# Assumes the file has exactly two columns, postcode then population -
# confirm against the CSV. Renaming lets the postcode match `data["store_zip"]`.
population.columns = ["store_zip", "population"]

# Left join: every visit is kept, and population is NaN where the postcode
# has no match in the population table.
data_pop = data.merge(population, on="store_zip", how="left")
data_pop.head(3)

# Correlate numeric/bool columns only. Since pandas 2.0, `DataFrame.corr()`
# no longer drops non-numeric columns by itself and raises on the text
# columns in this frame.
corr = data_pop.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    linewidths=2,
    annot=True,
    cmap='YlGnBu',
)